% Also note that the "draftcls" or "draftclsnofoot", not "draft", option
% should be used if it is desired that the figures are to be displayed in
% draft mode.
%
\documentclass[10pt,journal,final]{IEEEtran}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{graphicx}   % if you want to include graphics files
\usepackage{verbatim}
\usepackage{subfig}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage[section] {placeins}
\usepackage{url}
\usepackage{multirow}
\usepackage{bbding}
%\usepackage[boxed, noline, linesnumbered, figure]{algorithm2e}
    

\newtheorem{thm}{Theorem}
\newtheorem{cor}[thm]{Corollary}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{claim}[thm]{Claim}
\newtheorem{observation}[thm]{Observation}

\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}

\floatstyle{ruled}
\newfloat{program}{thp}{lop}
\floatname{program}{Algorithm}







\ifCLASSINFOpdf
\else
\fi

\usepackage{fixltx2e}
\usepackage{stfloats}
\hyphenation{op-tical net-works semi-conduc-tor}


\begin{document}
%
% paper title
% can use linebreaks \\ within to get better formatting as desired
\title{Distance and time matter: Spatio-temporal attributes of tuberculosis patients via coupled matrix-tensor and matrix-matrix factorization}
\author{Cagri~Ozcaglar,
        B\"{u}lent~Yener,
        Kristin~P.~Bennett% <-this % stops a space
%\thanks{M. Shell is with the Department
%of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta,
%GA, 30332 USA e-mail: (see http://www.michaelshell.org/contact.html).}% <-this % stops a space
%\thanks{J. Doe and J. Doe are with Anonymous University.}% <-this % stops a space
%\thanks{Manuscript received April 19, 2005; revised January 11, 2007.}
%\thanks{Rensselaer Polytechnic Institute}
\thanks{C. Ozcaglar and B. Yener are with the Computer Science
Department, Rensselaer Polytechnic Institute, Troy, NY USA (e-mail: ozcagc2@cs.rpi.edu; yener@cs.rpi.edu).}
\thanks{K. P. Bennett is with the Computer Science Department and Mathematical
Sciences Department, Rensselaer Polytechnic Institute, Troy, NY USA (e-mail: bennek@rpi.edu).}
}

% The paper headers
\markboth{}%
{Shell \MakeLowercase{\textit{et al.}}: Bare Demo of IEEEtran.cls for Journals}
% use for special paper notices
%\IEEEspecialpapernotice{(Invited Paper)}

% make the title area
\maketitle


\begin{abstract}
%\boldmath
\end{abstract}
% IEEEtran.cls defaults to using nonbold math in the Abstract.
% This preserves the distinction between vectors and scalars. However,
% if the journal you are submitting to favors bold math in the abstract,
% then you can use LaTeX's standard command \boldmath at the very start
% of the abstract to achieve this. Many IEEE journals frown on math
% in the abstract anyway.

% Note that keywords are not normally used for peerreview papers.
\begin{IEEEkeywords}
%tuberculosis, \emph{Mycobacterium tuberculosis} complex, DR region, spoligotype, MIRU-VNTR, mutation model.
\end{IEEEkeywords}

\IEEEpeerreviewmaketitle

%\IEEEPARstart{T}{his} demo file is intended to serve as a ``starter file''

\section{Introduction}

\section{Background}

\section{Methods}

\subsection{The dataset:}
The NYC dataset consists of 2187 patients in the United States detected from 2001 to 2007. Country of birth of each patient is available in the dataset, along with the spoligotype of MTBC strain which infected the patient and date of diagnosis. There are 353 unique spoligotypes in the dataset. MTBC strains are labeled by major lineages based on their spoligotypes using Conformal Bayesian Network (CBN) model \cite{CBN}. Using this dataset, we construct the host-pathogen tensor (HPT) of the form \emph{Patient Attributes} $\times$ \emph{Spoligotypes} $\times$ \emph{Time}, where we use country of birth as the patient attribute. The final HPT is $\mathbf{\underline{X}} \in \mathbb{R}^{(I=111) \times (J=353) \times (K=7)}$. The host-pathogen tensor (HPT) is shown in Figure \ref{HPT}.

\begin{figure}[H]
\centering
\includegraphics[width=3.0in]{Figures/HPT.pdf}
\caption{Host-pathogen tensor (HPT).}
\label{HPT}
\end{figure}

\subsection{Distance matrices}
In the host-pathogen tensor, the first mode represents host attributes, in this case country of birth. Proximity of countries can be found based on neighbourhood. Patients from close countries based on the proximity values are more likely to be involved in the same transmission event. Similarly, the second mode represent pathogen attributes, in this case spoligotypes. Genetic proximity of spoligotypes can be found using genetic distance measures. Hosts with close spoligotypes are more likely to be involved in the same mutation event.

\subsubsection{Spatial proximity matrix}
Given 111 countries, we first define the Country Neighbourhood Matrix (CNM). Given two countries $C_i$ and $C_j$, the CNM is defined as follows:

%\begin{figure}[!]
\begin{equation*}\label{CNM}
%\boxed{
\text{CNM}(C_i, C_j) = \begin{cases}
\displaystyle 1 \text{,} & \quad \text{if } C_i \text{ and } C_j \text{ are neighbours} \\
0 \text{,} & \quad \text{otherwise.} \end{cases}
%}
\end{equation*}
%\end{figure}

Let $L(C_i, C_j)$ be the length of shortest path from $C_i$ to $C_j$ based on Dijkstra's shortest path algorithm on CNM. Then, we define the spatial likelihood matrix as follows:

\begin{equation*}\label{P_S}
%\boxed{
P_S(C_i, C_j) = \begin{cases}
\displaystyle \frac{1}{1+L(C_i,C_j)} \text{,} & \quad \text{if } i\neq j \text{, } L(C_i, C_j) <=3 \\
1 \text{,} & \quad \text{if } i=j \\
0 \text{,} & \quad \text{otherwise.} \end{cases}
%}
\end{equation*}

Spatial proximity matrix $P_S$ has values inversely proportional to the length of shortest path between two countries, as long as the shortest-path length is at most 3. For country pairs with shortest-path length $L(C_i, C_j) > 3$, the proximity between two countries is set to zero. As a result, spatial proximity matrix reflects the likelihood of patients from two countries being involved in the same transmission event.

\subsubsection{Genetic proximity matrix}
Given 353 distinct spoligotypes, we define a proximity measure between them. Mutation of spoligotypes is based on the Contiguous Deletion Assumption (CDA), which states that one or more contiguous spacers can be deleted in a mutation event, but not gained. Let $s_i$ represent spoligotype $i$, and let $s_i \rightarrow s_j$ represent the mutation of spoligotype $s_i$ to spoligotype $s_j$. Then, we define the CDA matrix, which summarizes contiguous deletion assumption, as follows:

\begin{equation*}\label{CDA}
%\boxed{
\text{CDA}(s_i, s_j) = \begin{cases}
\text{true} \text{,} & \quad \text{if } s_i \rightarrow s_j \text{ or } s_j \rightarrow s_i\\
\text{false} \text{,} & \quad \text{otherwise.} \end{cases}
%}
\end{equation*}

Let $H(s_i, s_j)$ be the Hamming distance between spoligotypes $s_i$ and $s_j$, as defined in \cite{InsightsIntoBIBM2011}:

\begin{equation*}
H\left(s_i, s_j\right) = \displaystyle\sum_{r=1}^{43} \mid s_{ir} - s_{jr}\mid
\end{equation*}

\noindent where $s_{ir}$ represents the value of $r-th$ spacer of spoligotype $s_i$. Then, we define the genetic proximity matrix as follows:

\begin{equation*}\label{P_G}
%\boxed{
P_G(s_i, s_j) = \begin{cases}
\displaystyle \frac{1}{1+H(s_i,s_j)} \text{,} & \quad \text{if } i\neq j \text{, } \text{CDA}(s_i, s_j) \text{, } \\
                                              & \quad H(s_i, s_j) <=10 \\
1 \text{,} & \quad \text{if } i=j \\
0 \text{,} & \quad \text{otherwise.} \end{cases}
%}
\end{equation*}

Genetic proximity matrix $P_G$ has values inversely proportional to the Hamming distance between two spoligotypes, as long as the Hamming distance between them is at most 10. For spoligotype pairs with $P_G(s_i, s_j) > 10$, the genetic proximity is set to zero. Therefore, genetic proximity matrix reflects the likelihood of two different hosts being involved in the same mutation event.

\subsection{CMTCF: Coupled matrix-tensor clustering framework}
In order to analyze the host-pathogen tensor, spatial proximity matrix and genetic proximity matrix, we couple the tensor with matrices. For this purpose, we propose the Coupled matrix-tensor clustering framework (CMCTF). In the first step, we couple the tensor with matrices in various modes of the tensor. In the second step, we decompose the coupled matrix-tensor with two algorithms: CMTF\_PARAFAC\_ALS, also known as CMTF\_ALS in the literature; and CMTF\_Tucker\_ALS, a novel algorithm as an extension of CMTF\_ALS to Tucker3. The next two steps are borrowed from our earlier work, the Tensor Clustering Framework (TCF) \cite{TCF_MBT_BMC}. In the third step, we cluster the data points in the mode of interest using \texttt{kmeans\_mtimes\_seeded}, and validate clusters using Best-Match stability and F-measure. Figure \ref{CMTCF} shows the steps of CMTCF.

\begin{figure}[!t]
\centering
\includegraphics[width=2.5in]{Figures/CMTCF.pdf}
\caption{Coupled Matrix-tensor Clustering Framework (CMTCF).}
\label{CMTCF}
\end{figure}

\subsubsection{Coupled matrix-tensor generation - Data configurations}
The host-pathogen tensor $\mathbf{\underline{X}} \in \mathbb{R}^{I \times J \times K}$ are related to spatial proximity matrix $\mathbf{Y} \in \mathbb{R}^{I \times M}$ and genetic proximity matrix $\mathbf{Z} \in \mathbb{R}^{J \times N}$, and can be coupled with them. This relation leads to different data configurations to be analyzed simultaneously, as shown in Figure \ref{DataConfigurations}. In configuration 1), only the host-pathogen tensor $\mathbf{\underline{X}}$ is factorized. In configuration 2), genetic proximity matrix $\mathbf{Z}$ is coupled with the host-pathogen tensor $\mathbf{\underline{X}}$ in the second mode, incorporating the genetic distance to domain knowledge. In configuration 3), spatial proximity matrix $\mathbf{Y}$ is coupled with the host-pathogen tensor $\mathbf{\underline{X}}$ in the first mode, incorporating the spatial distance to domain knowledge. In configuration 4), spatial proximity matrix $\mathbf{Y}$ and genetic proximity matrix $\mathbf{Z}$ are coupled with the host-pathogen tensor $\mathbf{\underline{X}}$ in the first and second mode respectively, incorporating both the spatial distance and genetic distance to domain knowledge. In configuration 5), the host-pathogen tensor $\mathbf{\underline{X}}$ is contracted along the time mode, keeping the spatial proximity matrix $\mathbf{Y}$ and genetic proximity matrix $\mathbf{Z}$ coupled to the contracted host-pathogen tensor, which is now the matrix $\mathbf{X'}$. We use all 5 configurations when clustering in the patient mode and host mode, to test the effect of distance measures and time on detected groups.

\begin{figure}[!t]
\centering
\includegraphics[width=3.5in]{Figures/DataConfigurationsWithBorder.pdf}
\caption{Data configurations}
\label{DataConfigurations}
\end{figure}

\subsubsection{Coupled matrix-tensor factorization}
Here, we focus on step two of CMTCF, which is the coupled matrix-tensor factorization step. Next, we give the details of CMTF\_PARAFAC\_ALS and CMTF\_Tucker\_ALS for coupled matix-tensor factorization. We also give the details of CMMF\_ALS for the coupled matrix-matrix decomposition in data configuration 5 shown in Figure \ref{DataConfigurations}.

\paragraph{\textbf{CMTF\_PARAFAC\_ALS}}
Given the host-pathogen tensor $\mathbf{\underline{X}} \in \mathbb{R}^{I \times J \times K}$ coupled with spatial proximity matrix $\mathbf{Y} \in \mathbb{R}^{I \times I}$ and genetic proximity matrix $\mathbf{Z} \in \mathbb{R}^{J \times J}$, we approximate them as follows:

\begin{align}\label{ApproximationPARAFAC}
\mathbf{X_{(1)}} &\approx \mathbf{A} \left(\mathbf{C} \odot \mathbf{B}\right)'& \nonumber\\
\mathbf{Y} &\approx \mathbf{A} \mathbf{V}' & \nonumber \\
\mathbf{Z} &\approx \mathbf{B} \mathbf{W}' &
\end{align}

\noindent where $\odot$ denotes Khatri-Rao product. The loss function to minimize is the sum of squared Frobenius norm of each data block:

\begin{align} \label{LossFunction}
\min_{A,B,C,V,W} & ||\mathbf{X_{(1)}} - \mathbf{A} \left(\mathbf{C} \odot \mathbf{B}\right)'||_{F}^{2} +
                   ||\mathbf{Y} - \mathbf{A} \mathbf{V}'||_{F}^{2} +
                   ||\mathbf{Z} - \mathbf{B} \mathbf{W}'||_{F}^{2} \nonumber \\
\min_{A,B,C,V,W} & L(\mathbf{\underline{X}}, \mathbf{Y}, \mathbf{Z}, \mathbf{A}, \mathbf{B}, \mathbf{C}, \mathbf{V}, \mathbf{W})
\end{align}

CMTF\_PARAFAC\_ALS is also known as CMTF\_ALS algorithm in the literature, which is detailed in earlier studies \cite{CMTF_OPT}. Therefore, we skip the details of the algorithm, and only focus on the update step for each factor matrix. Minimization for $\mathbf{A},\mathbf{B},\mathbf{C},\mathbf{V},\mathbf{W}$ alternately returns the following updates at each step of CMTF\_PARAFAC\_ALS: \\

\noindent Update for $\mathbf{A}$:
\begin{align*}
\min_{A} & ||\mathbf{X_{(1)}} - \mathbf{A} \left(\mathbf{C} \odot \mathbf{B}\right)'||_{F}^{2} +
           ||\mathbf{Y} - \mathbf{A} \mathbf{V}'||_{F}^{2} \\
\min_{A} & ||\underbrace{[\mathbf{X_{(1)}} \,\, \mathbf{Y}]}_{T} - \mathbf{A} \underbrace{\left[\left(\mathbf{C} \odot \mathbf{B}\right)' \,\, \mathbf{V}'\right]}_{K}||_{F}^{2} \\
\Longrightarrow &\mathbf{A} = \left(\mathbf{T} \mathbf{K}'\right) / \left(\mathbf{K} \mathbf{K}'\right)
\end{align*}

\noindent Update for $\mathbf{B}$:
\begin{align*}
\min_{B} & ||\mathbf{X_{(2)}} - \mathbf{B} \left(\mathbf{C} \odot \mathbf{A}\right)'||_{F}^{2} +
           ||\mathbf{Z} - \mathbf{B} \mathbf{W}'||_{F}^{2} \\
\min_{B} & ||\underbrace{[\mathbf{X_{(2)}} \,\, \mathbf{Z}]}_{T} - \mathbf{B} \underbrace{\left[\left(\mathbf{C} \odot \mathbf{A}\right)' \,\, \mathbf{W}'\right]}_{K}||_{F}^{2} \\
\Longrightarrow &\mathbf{B} = \left(\mathbf{T} \mathbf{K}'\right) / \left(\mathbf{K} \mathbf{K}'\right)
\end{align*}

\noindent Update for $\mathbf{C}$:
\begin{align*}
\min_{C} & ||\underbrace{\mathbf{X_{(3)}}}_{T} - \mathbf{C} \underbrace{\left(\mathbf{B} \odot \mathbf{A}\right)'}_{K}||_{F}^{2}\\
\Longrightarrow &\mathbf{C} = \left(\mathbf{T} \mathbf{K}'\right) / \left(\mathbf{K} \mathbf{K}'\right)
\end{align*}

\noindent Update for $\mathbf{V}$:
\begin{align*}
\min_{V} & ||\mathbf{Y} - \mathbf{A} \mathbf{V}'||_{F}^{2} \\
\Longrightarrow &\mathbf{V} = \left(\left(\mathbf{A}' \mathbf{A}\right) \backslash \left(\mathbf{A}' \mathbf{Y}\right)\right)'
\end{align*}

\noindent Update for $\mathbf{W}$:
\begin{align*}
\min_{W} & ||\mathbf{Z} - \mathbf{B} \mathbf{W}'||_{F}^{2} \\
\Longrightarrow &\mathbf{W} = \left(\left(\mathbf{B}' \mathbf{B}\right) \backslash \left(\mathbf{B}' \mathbf{Z}\right)\right)'
\end{align*}

\paragraph{\textbf{CMTF\_Tucker\_ALS}}
The host-pathogen tensor $\mathbf{\underline{X}} \in \mathbb{R}^{I \times J \times K}$, spatial proximity matrix $\mathbf{Y} \in \mathbb{R}^{I \times I}$ and genetic proximity matrix $\mathbf{Z} \in \mathbb{R}^{J \times J}$ are approximated as in the system of equations (\ref{ApproximationTucker}). Note that in the Tucker3 model, the factor matrices $\mathbf{A}$, $\mathbf{B}$, $\mathbf{C}$ are orthogonal. Then, tensor $\mathbf{\underline{X}} \in \mathbb{R}^{I \times J \times K}$ can be decomposed using a (P,Q,R) Tucker3 model, while simultaneously factorizing $\mathbf{Y} \in \mathbb{R}^{I \times I}$ and $\mathbf{Z} \in \mathbb{R}^{I \times I}$ with the factor matrix of shared mode. The loss function in Equation (\ref{LossFunction}) will be alternately minimized for $\mathbf{A},\mathbf{B},\mathbf{C},\mathbf{V},\mathbf{W}$.

\begin{align}\label{ApproximationTucker}
\mathbf{X_{(1)}} &\approx \mathbf{A} \mathbf{G_{(1)}} \left(\mathbf{C}' \otimes \mathbf{B}'\right)& \nonumber\\
\mathbf{Y} &\approx \mathbf{A} \mathbf{V}' & \nonumber \\
\mathbf{Z} &\approx \mathbf{B} \mathbf{W}' &
\end{align}

\noindent The steps of minimization of the loss function for $\mathbf{A}$ is included in the Appendix. The derivative of $L$ with respect to $\mathbf{A}$ set to zero returns the following equation:

\begin{equation*}
\left(\mathbf{M} \mathbf{M}' + \mathbf{Y} \mathbf{Y}'\right) \mathbf{A} = \lambda \mathbf{A}
\end{equation*}

\noindent which means that $\mathbf{A}$ is composed of first $I$ eigenvectors of $\left(\mathbf{M} \mathbf{M}' + \mathbf{Y} \mathbf{Y}'\right)$. We denote it as follows:

\begin{align*}
\mathbf{M} &= \mathbf{X_{(1)}} \left(\mathbf{C}\mathbf{C}' \otimes \mathbf{B}\mathbf{B}'\right) \\
\mathbf{A} &= \text{EVD}\left(\mathbf{M} \mathbf{M}' + \mathbf{Y} \mathbf{Y}', I \right)
\end{align*}

\noindent Similarly for the other coupled mode, $\mathbf{B}$ is found as follows:

\begin{align*}
\mathbf{M_2} &= \mathbf{X_{(2)}} \left(\mathbf{C}\mathbf{C}' \otimes \mathbf{A}\mathbf{A}'\right) \\
\mathbf{B} &= \text{EVD}\left(\mathbf{M_2} \mathbf{M_2}' + \mathbf{Z} \mathbf{Z}', J \right)
\end{align*}

\noindent And for the uncoupled mode, $\mathbf{C}$ is found using SVD as in Tucker3-ALS algorithm:

\begin{align*}
\mathbf{M_3} &= \mathbf{X_{(3)}} \left(\mathbf{B}\mathbf{B}' \otimes \mathbf{A}\mathbf{A}'\right) \\
\mathbf{C} &= \text{SVD}\left(\mathbf{M_3}, K \right)
\end{align*}

\paragraph{\textbf{CMMF\_ALS}}
The host-pathogen tensor contracted along the time mode becomes the matrix $\mathbf{\underline{X}} \in \mathbb{R}^{I \times J \times J}$. Given spatial proximity matrix
$\mathbf{Y} \in \mathbb{R}^{I \times I}$ and genetic proximity matrix $\mathbf{Z} \in \mathbb{R}^{J \times J}$ are approximated as in the system of equations (\ref{ApproximationCMMF}).
We initialize the factor matrices using truncated SVD, and then alternately minimize the loss function will be alternately minimized for $\mathbf{A},\mathbf{B},\mathbf{V},\mathbf{W}$.

\begin{align}\label{ApproximationCMMF}
\mathbf{X} &\approx \mathbf{A} \mathbf{B}' & \nonumber\\
\mathbf{Y} &\approx \mathbf{A} \mathbf{V}' & \nonumber \\
\mathbf{Z} &\approx \mathbf{B} \mathbf{W}' &
\end{align}

\noindent Minimizing the loss function for $\mathbf{A},\mathbf{B},\mathbf{V},\mathbf{W}$ after fixing other factor matrices, we update the matrices as follows in CMMF:

\begin{align*}
\mathbf{A} &= \left(\mathbf{X} \mathbf{B} + \mathbf{Y} \mathbf{V}\right) \backslash \left(\mathbf{B}' \mathbf{B} + \mathbf{V}' \mathbf{V} \right) \\
\mathbf{B} &= \left(\mathbf{X}' \mathbf{A} + \mathbf{Z} \mathbf{W}\right) \backslash \left(\mathbf{A}' \mathbf{A} + \mathbf{W}' \mathbf{W} \right) \\
\mathbf{V} &= \left(\mathbf{Y}' \mathbf{A}\right) \backslash \left(\mathbf{A}' \mathbf{A}\right) \\
\mathbf{W} &= \left(\mathbf{Z}' \mathbf{B}\right) \backslash \left(\mathbf{B}' \mathbf{B}\right)
\end{align*}

\subsection{Host-pathogen association analysis}
For host-pathogen association analysis, we used spatial proximity matrix, genetic proximity matrix. Let $\mathbf{\underline{X}}$ denote the host-pathogen tensor. Let ($p_1$,$s_1$) and ($p_2$,$s_2$) be two patient-strain pairs, and $T_1$ and $T_2$ be their time profiles respectively, such that $T_1 = \mathbf{\underline{X}}(p_1,s_1,.)$ and $T_2 =\mathbf{\underline{X}}(p_2,s_2,.)$. We assume that there is an association between patient pairs ($p_1$,$p_2$) and strain pairs ($s_1$,$s_2$) if the following conditions hold:

\begin{align*}
& P_S\left(C(p_1), C(p_2)\right) \geq 0.5 \\
& P_G\left(S(s_1), S(s_2)\right) \geq 0.5 \\
& \sum_{t \in T_1} t \geq th \text{, } \sum_{t \in T_2} t \geq th \\
& \text{JSD}(T_1,T_2) < 0.01  \\
\end{align*}

\noindent where JSD$(T_1, T_2)$ denotes the Jensen-Shannon Divergence between time profiles of patient-strain pairs ($p_1$,$s_1$) and ($p_2$,$s_2$), and $th$ is the threshold population size for a genetic cluster. The first condition states that the country of birth of patients must be spatially close. The second condition states that the MTBC strains which infected these patients must be genetically close. The third condition states that there must be at least $th$ patients from country $C(p_1)$ infected with strain $s_1$, and at least $th$ patients from country $C(p_2)$ infected with strain $s_2$. The final condition states that the distribution of time profiles of patient-strain pairs must be close.

\section{Results}
Based on various data configurations composed of host-pathogen tensor, spatial distance matrix, and genetic distance matrix, we clustered spoligotypes in the host mode and coutries in the patient mode. Finally, we compared the time profiles of host-pathogen pairs consisting of spatially close patients infected genetically close MTBC strains, and found patient-strain groups with similar trends. In these experiments, for each data configuration in Figure \ref{DataConfigurations}, we used the following clustering frameworks. For data configuration 1), we used the tensor Clustering Framework (TCF) by Ozcaglar et al. \cite{TCF_MBT_BMC}. For data configuration 2), 3) and 4), we used the Coupled Matrix-Tensor Clustering Framework modifications CMCTF$_2$, CMCTF$_1$ and CMCTF$_{1,2}$ respectively, where the subscripts denote the coupled modes. For data configuration 5), we used the Coupled Matrix-Matrix Clustering Framework (CMMCF). Next, we move on to clustering of spoligotypes in the pathogen mode.

\subsection{Spoligotype mode analysis}
We clustered 353 distinct spoligotypes via different data configurations using TCF, CMTCF and CMMCF. We analyzed the results based on PARAFAC and Tucker3 models separately. For TCF, PARAFAC and Tucker3 model are used separately, and for CMTCF CMTF\_PARAFAC\_ALS and CMTF\_Tucker\_ALS are used seperately. For CMMCF, there is no such division, because tensor decomposition is not used. Spoligotypes are labeled by 6 major lineages defined by CBN \cite{CBN}, and we use it as the ground truth to calculate F-measure.

Figures \ref{SpolMode_PARAFACvars} and \ref{SpolMode_Tucker3vars} show the change in F-measure with number of clusters, using PARAFAC variants and Tucker3 variants of clustering frameworks respectively. Based on the plots, when spoligotypes are clustered using TCF on the host-pathogen tensor with PARAFAC and Tucker3 models, the F-measure is in the range [0.12,0.13], which is the lowest. Therefore, TCF perform worst for clustering spoligoytpes.

When only genetic proximity matrix is coupled with the host-pathogen tensor in the second mode, and when spoligotypes are clustered on this data configuration using CMTCF$_2$, CMTF\_PARAFAC\_ALS shows a downward trend in F-measure as the number of clusters increase. On the other hand, CMTF\_Tucker\_ALS in CMTCF$_2$ performs better and returns F-measure values in the range [0.6, 0.7], as the number of clusters increase. When only spatial proximity matrix is coupled with the host-pathogen tensor in the first mode, spoligotypes are clustered on this data configuration using CMTCF$_1$. In this case, CMTF\_PARAFAC\_ALS and CMTF\_Tucker\_ALS return equally high F-measure values, but CMTF\_Tucker\_ALS returns slightly higher F0measure values. When both genetic and spatial proximity matrices are coupled with the host-pathogen tensor, spoligotypes are clustered using CMTCF$_{1,2}$. In this case, CMTCF$_{1,2}$ based on CMTF\_Tucker\_ALS returns F-measure values in the range [0.6, 0.7], which outperforms CMTCF$_{1,2}$ based on CMTF\_PARAFAC\_ALS as the number of clusters increase. Finally, to test if time information is needed, we contract the host-pathogen tensor along the time mode, and cluster spoligotypes using coupled matrix-matrix factorization via CMMF\_ALS in CMMCF. There are no tensors in this data configuration, therefore no tensor decomposition methods are used in CMMCF. As a result, the lines for CMMCF in Figures \ref{SpolMode_PARAFACvars} and \ref{SpolMode_Tucker3vars} are the same, and they are the result of CMMF\_ALS. Using this data configuration, we eliminate time information. As we can see in the plots, CMMCF returns high F-measure values, comparable to CMTCF variants, as long as number of clusters $k<=8$. For higher number of clusters, the F-measure values based on CMMCF decrease drastically.

\begin{figure*}[!t]
\centerline{
\subfloat[TCF with PARAFAC, CMTCF with CMTF\_PARAFAC\_ALS, and CMMCF on spoligoytpes]{\includegraphics[width=3.4in]{Figures/SpolMode_PARAFACvars.pdf}
\label{SpolMode_PARAFACvars}}
\hfil
\subfloat[TCF with Tucker3, CMTCF with CMTF\_Tucker\_ALS, and CMMCF on spoligoytpes]{\includegraphics[width=3.4in]{Figures/SpolMode_Tucker3vars.pdf}
\label{SpolMode_Tucker3vars}}
}
\caption{Spoligotype clustering results with TCF variants, CMTCF variants, and CMMCF}
\label{SpolModeAnalysis}
\end{figure*}

Spoligotype clustering results based on different clustering frameworks gives insights about host-pathogen association. When TCF on the host-pathogen tensor is used, the lineage predictions of spoligotypes are not accurate. When we incorporate spatial proximity matrix, CMTF\_PARAFAC\_ALS in CMTCF performs good, while coupling genetic proximity matrix and clustering spoligotypes using CMTF\_PARAFAC\_ALS in CMTCF returns low F-measure values. On the other hand, coupling at least one of the spatial distance matrix and genetic distance matrix and clustering via CMTF\_Tucker\_ALS in CMTCF returns the highest and equally good F-measure values. This shows that addition of spatial or genetic distance matrices into the domain knowledge and factorizing via CMTF\_Tucker\_ALS in CMTCF improves clustering accuracy. Finally, we contract the tensor and observe that CMMCF returns high clustering accuracy for low number of clusters, whereas the accuracy decreases as the number of clusters increase. Therefore, time information is needed to cluster spoligotypes accurately. Overall, the best clustering accuracy is attained when CMTCF is used with CMTF\_Tucker\_ALS on host-pathogen tensor coupled with at least one of the spatial proximity matrix and genetic proximity matrix.

\subsection{Patient mode analysis}

We clustered 111 distinct countries of birth via different data configurations using TCF, CMTCF and CMMCF. We analyzed the results based on PARAFAC and Tucker3 models separately. For TCF, PARAFAC and Tucker3 model are used separately, and for CMTCF, CMTF\_PARAFAC\_ALS and CMTF\_Tucker\_ALS are used seperately. For CMMCF, there is no such division, because tensor decomposition is not used. There is no \emph{a priori} known clustering of these countries, therefore external cluster validation measures can not be used. Instead, we used an internal measure, best-match stability, to evaluate the clusterings.

Figures \ref{PatientMode_PARAFACvars} and \ref{PatientMode_Tucker3vars} show the change in best-match stability values with number of clusters, using PARAFAC variants and Tucker3 variants of clustering frameworks respectively. Overall, clusterings of Tucker3 and its variants under various clustering frameworks are more stable than clusterings of PARAFAC variants. Cluster analysis of this mode reveals outliers, since patients are heavily from specific countries. TCF on the host-pathogen tensor using PARAFAC model returns two clusters, one of which is the United States, and the other cluster consists of all other countries. As the number of clusters increase, China, Ecuador, Mexico, India, Philippines, Guyana, Peru, Haiti are grouped in their own cluster, but the clusterings become less stable. The same outlier are found in PARAFAC variants of CMTCF$_1$, CMTCF$_2$, and CMMCF. When CMTCF$_{1,2}$ with CMTF\_PARAFAC\_ALS is used, the clusters are larger, but unrelated.

Tucker3 variants of clustering frameworks find more stable clusters with reasonable groups of countries, except TCF and CMTCF$_2$ which detect outlier countries. When countries are clustered via CMTCF$_1,2$ using CMTF\_Tucker\_ALS, the most stable clustering is obtained when there are 10 clusters, and these clusters consist of small and reasonable groups. Guatemala and Panama are grouped together, and they are both in Central America, close to each other. Armenia, Georgia, Serbia Montenegro are three European countries close to each other, and they are grouped together. Moreover, Armenia and Georgia are nieghbours. Brazil, Ecuador and Nicaragua are in one cluster. The length of shortest path from Brazil to Ecuador is 2. Colombia and Honduras are grouped together, which are close to Central America. The length of shortest path from Colombia to Honduras is greater than 3, but the observed spoligotypes in patients from these countries are genetically close. Argentina, Peru and Venezuela are three countries in South America which are clustered together. Interestingly, the length of the shortest path between any pair in this group is 2. Greece and Ukraine are two countries in Europe, and they are grouped together. The length of the shortest path between them is greater than 3. Albania, Bosnia Herzegovina, China, Italy and Russia are grouped together. The length of the shortest path between Albania and Bosnia and Herzegovina is 2, so they are close. The length of the shortest path between China and Russia is 1, so they are close as well. But Italy is not spatially close to these countries, an MTBC strain must have been spread via migration.

\begin{figure*}[!t]
\centerline{
\subfloat[TCF with PARAFAC, CMTCF with CMTF\_PARAFAC\_ALS, and CMMCF on country of birth of patients]{\includegraphics[width=3.4in]{Figures/PatientMode_PARAFACvars.pdf}
\label{PatientMode_PARAFACvars}}
\hfil
\subfloat[TCF with Tucker3, CMTCF with CMTF\_Tucker\_ALS, and CMMCF on country of birth of patients]{\includegraphics[width=3.4in]{Figures/PatientMode_Tucker3vars.pdf}
\label{PatientMode_Tucker3vars}}
}
\caption{Country of birth clustering results with TCF variants, CMTCF variants, and CMMCF}
\label{PatientModeAnalysis}
\end{figure*}

\subsection{Host-pathogen association}
In order to find host-pathogen associations, we compared patient-strain pairs based on spatial proximity of patients, genetic proximity of MTBC strains, and time profiles of patient-strain pairs using the conditions described in the methods section. We changed the value of threshold population size $th$ of genetic clusters, and compared patient-strain pairs.

When the threshold population size is $th=10$, we obtain three pairs of similar patient-strain pairs. Table \ref{PatientStrainPairs_th10} lists these three pairs of patient-strain groups. From the table, we see that MTBC strains with shared type ST1 infects patients from China and India with a similar time trend. In addition, time trend of infections with MTBC strains genotyped with ST1 have a similar time trend in patients from Dominican Republic and Haiti, and also Mexico and United States. Notice that these countries are neighbours, as restricted using the spatial distance matrix. Figures \ref{ST1_ChinaIndia}, \ref{ST1_DomHaiti}, \ref{ST1_MexicoUS} show the number of patients infected with MTBC strains genotyped by ST1 between 2001 and 2007, in China and India, in Dominican Republic and Haiti, in Mexico and United States respectively.

\begin{table}[!]
\centering
\begin{tabular}{|c|c||c|c|}
\cline{1-4}
\multicolumn{2}{|c|}{Pair 1} & \multicolumn{2}{|c|}{Pair 2}  \\ \hline \hline
Country of birth  & Strain & Country of birth & Strain \\ \hline \cline{1-4}
China  & ST1 & India & ST1 \\ \cline{1-4}
Dominican Republic  & ST1 & Haiti & ST1 \\ \cline{1-4}
Mexico  & ST1 & United States & ST1 \\ \cline{1-4}
\end{tabular}
\caption{Patient-strain pairs with similar time profiles.}
\label{PatientStrainPairs_th10}
\end{table}

\begin{figure*}[!]
\centerline{
\subfloat[ST1 in China vs. India]{\includegraphics[width=2.5in]{Figures/S00034_ChinaIndia.pdf}
\label{ST1_ChinaIndia}}
\hfil
\subfloat[ST1 in Dominican Republic vs. Haiti]{\includegraphics[width=2.5in]{Figures/S00034_DomHaiti.pdf}
\label{ST1_DomHaiti}}
\hfil
\subfloat[ST1 in Mexico vs. United States]{\includegraphics[width=2.5in]{Figures/S00034_MexicoUS.pdf}
\label{ST1_MexicoUS}}
}
\caption{Time trend of ST1 in neighbour countries}
\label{ST1_th10}
\end{figure*}



\begin{figure*}[!]
\centerline{                                                                                               
\subfloat[ST50 in China vs. India]{\includegraphics[width=2.5in]{Figures/ST50_ChinaIndia.pdf}              
\label{ST50_ChinaIndia}}                                                                                   
\hfil                                                                                                      
\subfloat[ST53 in China vs. ST50 in India]{\includegraphics[width=2.5in]{Figures/ST53_ST50_ChinaIndia.pdf} 
\label{ST53_ST50_ChinaIndia}}                                                                              
}                                                                                                          
\caption{Time trend of various strains in China and India when threshold population size $th=8$ is used.}  
\label{th8_ChinaIndia}                                                                                     
\end{figure*}                                                                                              










When we drop the threshold population size to $th=8$, we obtain 10 additional pairs of similar patient-strain pairs in addition to the ones listed in Table \ref{PatientStrainPairs_th10}.

\begin{table}[!t]
\centering
\begin{tabular}{|c|c||c|c|}
\cline{1-4}
\multicolumn{2}{|c|}{Pair 1} & \multicolumn{2}{|c|}{Pair 2}  \\ \hline \hline
Country of birth  & Strain & Country of birth & Strain \\ \hline \cline{1-4}
China &	ST1	& Nepal & ST1 \\ \cline{1-4}
China &	ST1	& Pakistan & ST1 \\ \cline{1-4}
China & ST50 & India & ST50 \\ \cline{1-4}
China & ST53 & India & ST50 \\ \cline{1-4} %%%****
India & ST1 & Nepal & ST1 \\ \cline{1-4}
India & ST1 & Pakistan & ST1 \\ \cline{1-4}
Mexico & ST47 & United States & ST47 \\ \cline{1-4}
Mexico & ST53 & United States & ST119 \\ \cline{1-4}
Mexico & ST53 & United States & ST50 \\ \cline{1-4}
Mexico & ST53 & United States & ST53 \\ \cline{1-4}
\end{tabular}
\caption{Patient-strain pairs with similar time profiles.}
\label{PatientStrainPairs_th8}
\end{table}

\begin{figure*}[!]
\centerline{
\subfloat[ST47 in Mexico vs. United States]{\includegraphics[width=2.5in]{Figures/ST47_MexicoUS.pdf}
\label{ST47_MexicoUS}}
\hfil
\subfloat[ST53 in Mexico vs. ST119 in United States]{\includegraphics[width=2.5in]{Figures/ST53_ST119_MexicoUS.pdf}
\label{S53_ST119_MexicoUS}}
\hfil
\subfloat[ST53 in Mexico vs. ST50 in United States]{\includegraphics[width=2.5in]{Figures/ST53_ST50_MexicoUS.pdf}
\label{ST53_ST50_MexicoUS}}
\hfil
\subfloat[ST53 in Mexico vs. United States]{\includegraphics[width=2.5in]{Figures/ST53_MexicoUS.pdf}
\label{ST53_MexicoUS}}
}
\caption{Time trend of various strains in Mexico and United States when threshold population size $th=8$ is used.}
\label{th8_MexicoUS}
\end{figure*}

%\begin{figure*}[!]
%\centerline{
%\subfloat[ST47 in Mexico vs. United States]{\includegraphics[width=2.5in]{Figures/ST47_MexicoUS.pdf}
%\label{ST47_MexicoUS}}
%\hfil
%\subfloat[ST53 in Mexico vs. ST119 in United States]{\includegraphics[width=2.5in]{Figures/ST53_ST119_MexicoUS.pdf}
%\label{S53_ST119_MexicoUS}}
%}
%\caption{Time trend of various strains in Mexico and United States when threshold population size $th=8$ is used.}
%\label{th8_MexicoUS}
%\end{figure*}
%
%%\newpage
%\pagebreak
%\begin{figure*}[!]
%\centerline{
%\subfloat[ST53 in Mexico vs. ST50 in United States]{\includegraphics[width=2.5in]{Figures/ST53_ST50_MexicoUS.pdf}
%\label{ST53_ST50_MexicoUS}}
%\hfil
%\subfloat[ST53 in Mexico vs. United States]{\includegraphics[width=2.5in]{Figures/ST53_MexicoUS.pdf}
%\label{ST53_MexicoUS}}
%}
%\caption{Time trend of various strains in Mexico and United States when threshold population size $th=8$ is used.}
%\label{th8_MexicoUS}
%\end{figure*}

\section{Discussion and Conclusion}

\newpage
\section*{Appendix}
\subsection*{\textbf{CMTF\_Tucker\_ALS}}

Update for $\mathbf{A}$:

\begin{figure*}
\begin{align*}
\min_{A} & ||\mathbf{X_{(1)}} - \mathbf{A} \mathbf{G_{(1)}} \left(\mathbf{C}' \otimes \mathbf{B}'\right)||_{F}^{2} +
           ||\mathbf{Y} - \mathbf{A} \mathbf{V}'||_{F}^{2} \\
\min_{A} & ||\left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right] - \left[\mathbf{A} \mathbf{G_{(1)}} \left(\mathbf{C}' \otimes \mathbf{B}'\right) \,\, \mathbf{A}\mathbf{V}'\right]||_{F}^{2} \\
\min_{A} & ||\left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right] - \left[\mathbf{A} \mathbf{A}' \underbrace{\mathbf{X_{(1)}} \left(\mathbf{C}\mathbf{C}' \otimes \mathbf{B}\mathbf{B}'\right)}_{\mathbf{M}} \,\, \mathbf{A}\mathbf{V}'\right]||_{F}^{2} \\
\min_{A} & ||\left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right] - \left[\mathbf{A} \mathbf{A}' \mathbf{M} \,\, \mathbf{A}\mathbf{V}'\right]||_{F}^{2} \\
\min_{A} & +\text{tr}\left(\left(\left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right] - \left[\mathbf{A} \mathbf{A}' \mathbf{M} \,\, \mathbf{A}\mathbf{V}'\right]\right) \left(\left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right]' - \left[\mathbf{A} \mathbf{A}' \mathbf{M} \,\, \mathbf{A}\mathbf{V}'\right]'\right) \right) \\
\min_{A} & +\text{tr}\left( \left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right] \left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right]'\right)
           -2\text{tr}\left( \left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right] \left[\mathbf{M}' \mathbf{A} \mathbf{A}' \,\,\text{;} \,\, \mathbf{V}\mathbf{A}'\right]\right)
           +\text{tr}\left( \left[\mathbf{A} \mathbf{A}' \mathbf{M} \,\, \mathbf{A}\mathbf{V}'\right] \left[\mathbf{M} \mathbf{A} \mathbf{A}' \,\,\text{;} \,\, \mathbf{V}\mathbf{A}'\right]\right) \\
\min_{A} & -2\text{tr}\left( \left[\mathbf{X_{(1)}} \,\, \mathbf{Y}\right] \left[\mathbf{M}' \mathbf{A} \mathbf{A}' \,\,\text{;} \,\, \mathbf{V}\mathbf{A}'\right]\right)
           +\text{tr}\left( \mathbf{A} \mathbf{A}' \mathbf{M} \mathbf{M}' \mathbf{A} \mathbf{A}' + \mathbf{A} \mathbf{V}' \mathbf{V} \mathbf{A}'\right) \\
\min_{A} & -2\text{tr}\left( \mathbf{X_{(1)}}\mathbf{M}' \mathbf{A} \mathbf{A}' + \mathbf{Y} \mathbf{V}\mathbf{A}'\right)
           +\text{tr}\left( \mathbf{A} \mathbf{A}' \mathbf{M} \mathbf{M}' \mathbf{A} \mathbf{A}' + \mathbf{A} \mathbf{V}' \mathbf{V} \mathbf{A}'\right) \\
\min_{A} & -2\text{tr}\left( \mathbf{X_{(1)}}\mathbf{M}' \mathbf{A} \mathbf{A}'\right) -2\text{tr}\left(\mathbf{Y} \mathbf{V}\mathbf{A}'\right)
           +\text{tr}\left( \mathbf{A} \mathbf{A}' \mathbf{M} \mathbf{M}' \mathbf{A} \mathbf{A}'\right) + +\text{tr}\left(\mathbf{A} \mathbf{V}' \mathbf{V} \mathbf{A}'\right) \\
\min_{A} & -2\text{tr}\left( \mathbf{M}\mathbf{M}' \mathbf{A} \mathbf{A}'\right) -2\text{tr}\left(\mathbf{Y} \mathbf{Y}' \mathbf{A} \mathbf{A}'\right)
           +\text{tr}\left( \mathbf{A}' \mathbf{M} \mathbf{M}' \mathbf{A} \right) + +\text{tr}\left(\mathbf{A} \mathbf{A}' \mathbf{Y} \mathbf{Y}' \mathbf{A} \mathbf{A}' \right) \\
\min_{A} & -2\text{tr}\left( \mathbf{A}' \mathbf{M}\mathbf{M}' \mathbf{A} \right) -2\text{tr}\left(\mathbf{A}' \mathbf{Y} \mathbf{Y}' \mathbf{A} \right)
           +\text{tr}\left( \mathbf{A}' \mathbf{M} \mathbf{M}' \mathbf{A} \right) + +\text{tr}\left(\mathbf{A}' \mathbf{Y} \mathbf{Y}' \mathbf{A} \right)\\
\min_{A} & -\text{tr}\left( \mathbf{A}' \mathbf{M}\mathbf{M}' \mathbf{A} \right) -\text{tr}\left(\mathbf{A}' \mathbf{Y} \mathbf{Y}' \mathbf{A} \right) \\
\text{s.t} & \,\,\, \mathbf{A}'\mathbf{A} = \mathbf{I}
%\Longrightarrow &\mathbf{A} = \left(\mathbf{T} \mathbf{K}'\right) / \left(\mathbf{K} \mathbf{K}'\right)
\end{align*}
\end{figure*}

\noindent Lagrangian of this function is:

\begin{equation*}
L = -\text{tr}\left( \mathbf{A}' \mathbf{M}\mathbf{M}' \mathbf{A} \right) -\text{tr}\left(\mathbf{A}' \mathbf{Y} \mathbf{Y}' \mathbf{A} \right)
    + \lambda \left(\text{tr}\left(\mathbf{A}' \mathbf{A} - \mathbf{I} \right) \right)\\
\end{equation*}

\section*{Acknowledgments}

\ifCLASSOPTIONcaptionsoff
  \newpage
\fi


% IEEEtranS
\bibliographystyle{ieeetr} % ieeetr - if you want the order to be the order of citation
\bibliography{VisPaperBib}

% that's all folks
\end{document}


